References
import time
# Wall-clock start of the run; subtracted from time.time() at the end of the notebook to report the total run time.
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
# update modules
!pip uninstall xgboost
!pip install -U xgboost
print('Environment: Google Colab')
from search import HyperbandSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
# modelling
from sklearn.preprocessing import OneHotEncoder
import imblearn
from imblearn.over_sampling import SMOTE
import sklearn.metrics as skmetrics
# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
# boosting
from lightgbm import LGBMClassifier
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
plotly_express 0.4.1 imblearn 0.7.0 json 2.0.9 autopep8 1.5.2 pandas 1.1.4 numpy 1.19.4 seaborn 0.11.0 joblib 0.17.0
def show_methods(obj, ncols=4, contains=None):
    """Tabulate the public attribute names of ``obj``.

    Names starting with an underscore are skipped. When ``contains`` is
    given, only names that include that substring are kept. The remaining
    names are split into ``ncols`` chunks, one chunk per column, and empty
    cells are filled with ''.
    """
    names = []
    for attr in dir(obj):
        if attr[0] == '_':
            continue
        if contains is not None and contains not in attr:
            continue
        names.append(attr)

    chunks = np.array_split(names, ncols)
    table = pd.DataFrame(chunks).T
    return table.fillna('')
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
    """Report, save and optionally plot binary-classification metrics.

    Parameters
    ----------
    model_name : str
        Row label of the result table; also used in the CSV file name.
    ytest : array-like
        True binary labels.
    ypreds : array-like
        Predicted class labels.
    yprobs2d : array-like with two columns
        Per-class probabilities; column 1 is used as the positive-class
        score for the AUC.
    show_plots : bool, default True
        Whether to draw the precision-recall, ROC and confusion-matrix plots.

    Side effects
    ------------
    Prints the classification report and confusion matrix, displays the
    metric table, and writes ``model_{model_name}.csv`` to the current
    directory on Colab or to ``../outputs`` otherwise.
    """
    import os
    import sklearn.metrics as skmetrics
    import scikitplot.metrics as skpmetrics

    acc = skmetrics.accuracy_score(ytest,ypreds)
    precision = skmetrics.precision_score(ytest,ypreds)
    recall = skmetrics.recall_score(ytest,ypreds)
    f1 = skmetrics.f1_score(ytest,ypreds)
    # AUC is a ranking metric: score it on the positive-class probability,
    # not on the already-thresholded labels (which collapses the ROC curve
    # to a single operating point).
    auc = skmetrics.roc_auc_score(ytest, np.asarray(yprobs2d)[:, 1])

    print(skmetrics.classification_report(ytest,ypreds))
    print(skmetrics.confusion_matrix(ytest,ypreds))

    df_res = pd.DataFrame({'Accuracy':[acc],
                           'Precision': [precision],
                           'Recall': [recall],
                           'F1-score': [f1],
                           'AUC': [auc]},index=[model_name])
    display(df_res.style.format("{:.4f}"))

    # Create only the directory that is actually written to, and join the
    # path properly ('.' + 'model_x.csv' used to yield a hidden '.model_x.csv').
    out_dir = '.' if ENV_COLAB else '../outputs'
    os.makedirs(out_dir, exist_ok=True)
    df_res.to_csv(os.path.join(out_dir, f'model_{model_name}.csv'),index=True)

    if show_plots:
        skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
        # plot_roc replaces the deprecated plot_roc_curve
        skpmetrics.plot_roc(ytest,yprobs2d) # equal focus on both groups
        skpmetrics.plot_confusion_matrix(ytest,ypreds)
class FrequencyEncoder:
    """Encode categorical columns by how often each value occurs.

    ``fit`` records, per column in ``cols``, the count of every value in the
    training frame. ``transform`` replaces each value by its training count;
    values not seen during ``fit`` fall back to their count in the frame
    being transformed.
    """

    def __init__(self, cols):
        # cols: names of the columns to encode
        self.cols = cols
        # {column: {value: count}}, populated by fit()
        self.counts_dict = None

    def fit(self, X: pd.DataFrame, y=None) -> "FrequencyEncoder":
        """Learn per-column value counts from ``X``; ``y`` is ignored.

        Returns ``self`` so calls can be chained.
        """
        counts_dict = {}
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            counts_dict[col] = dict(zip(values, counts))
        self.counts_dict = counts_dict
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with ``cols`` replaced by occurrence counts.

        The input frame is left untouched.
        """
        X = X.copy()  # never write into the caller's frame
        encoded = []
        for col in self.cols:
            values, counts = np.unique(X[col], return_counts=True)
            train_counts = self.counts_dict[col]
            # prefer the "train" count; unseen values keep their "test" count
            mapping = {
                value: train_counts.get(value, count)
                for value, count in zip(values, counts)
            }
            encoded.append(X[col].map(mapping).values.reshape(-1, 1))
        if encoded:  # np.hstack rejects an empty list
            X[self.cols] = np.hstack(encoded)
        return X

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X, y).transform(X)
# Local runs read the raw CSVs from the repository; on Colab the same files
# are fetched from GitHub instead.
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
    path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
    path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
# Preview the first and last two rows. DataFrame.append is deprecated (and
# removed in pandas 2.0); pd.concat produces the same frame.
pd.concat([df_train.head(2), df_train.tail(2)])
(5634, 21) (1409, 21)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
| 1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
| 5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
| 5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
# Name of the label column; reused below to separate the features from the target.
target_name = 'Churn'
import plotly_express as px
# Histogram of the target column.
px.histogram(df_train, x=target_name,height=300,width=300)
# Histogram of gender, coloured by the target column.
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
# Features: every column except the target.
df_Xtrain, df_Xtest = (
    frame.drop(columns=target_name) for frame in (df_train, df_test)
)

# Target: text labels mapped to integers ('No' -> 0, 'Yes' -> 1).
ser_ytrain, ser_ytest = (
    frame[target_name].map({'No': 0, 'Yes': 1}) for frame in (df_train, df_test)
)

# Flat numpy copies of the targets.
ytrain, ytest = (np.array(ser).flatten() for ser in (ser_ytrain, ser_ytest))

df_Xtrain.head(2)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 |
| 1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
# Columns that pandas loaded with a numeric dtype.
cols_num = list(df_train.select_dtypes('number').columns)
cols_num
['SeniorCitizen', 'tenure', 'MonthlyCharges']
# gender is no good predictor as seen in EDA
cols_exclude = ['customerID', 'gender', 'TotalCharges', target_name]

# Object-dtype columns minus the excluded ones, plus SeniorCitizen, which is
# treated as a category here.
cols_cat = [
    col for col in df_train.select_dtypes('object').columns
    if col not in cols_exclude
]
cols_cat.append('SeniorCitizen')
print(cols_cat)
['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
# Routed through pipe_num1, which coerces the column to numbers before scaling.
cols_num1 = ['TotalCharges']
# Routed through pipe_num2 (scaling only).
cols_num2 = ['tenure', 'MonthlyCharges']
# Not passed to the preprocessor in the visible code; these columns are removed by its remainder='drop'.
cols_drop = ['customerID','gender']
ColumnTransformer(
transformers,
*,
remainder='drop',
sparse_threshold=0.3,
n_jobs=None,
transformer_weights=None,
verbose=False,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
def make_numeric_and_impute(dfx,col):
    """Return a copy of ``dfx`` with column ``col`` coerced to numbers.

    Values that cannot be parsed as numbers become NaN and are then replaced
    by 0. The input frame is not modified, so the function is safe to call on
    a slice of another DataFrame (as happens inside a ColumnTransformer).
    """
    dfx = dfx.copy()  # avoid writing into the caller's frame
    dfx[col] = pd.to_numeric(dfx[col],errors='coerce').fillna(0)
    return dfx
# Categorical columns: one-hot encoding with handle_unknown='ignore'.
pipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore'))])

# Alternative categorical encoding by value counts; defined here but not
# wired into `preprocessor` below.
pipe_cat_freq = Pipeline(steps=[('freq_enc', FrequencyEncoder(cols=cols_cat))])

# Numeric columns that only need scaling.
pipe_num2 = Pipeline(steps=[('scaler', StandardScaler())])

# TotalCharges: coerce to numbers (unparseable -> 0), then scale.
pipe_num1 = Pipeline(steps=[
    ('imputer', FunctionTransformer(make_numeric_and_impute, kw_args={'col': 'TotalCharges'})),
    ('scaler', StandardScaler()),
])

# Apply each sub-pipeline to its column group; all other columns are dropped.
preprocessor = ColumnTransformer(
    [
        ('num1', pipe_num1, cols_num1),
        ('num2', pipe_num2, cols_num2),
        ('cat', pipe_cat, cols_cat),
    ],
    remainder='drop',
)
lgb.LGBMClassifier(
boosting_type = 'gbdt',
num_leaves = 31,
max_depth = -1,
learning_rate = 0.1,
n_estimators = 100,
subsample_for_bin = 200000,
objective = None,
class_weight = None,
min_split_gain = 0.0,
min_child_weight = 0.001,
min_child_samples = 20,
subsample = 1.0,
subsample_freq = 0,
colsample_bytree = 1.0,
reg_alpha = 0.0,
reg_lambda = 0.0,
random_state = None,
n_jobs = -1,
silent = True,
importance_type = 'split',
**kwargs,
)
from lightgbm import LGBMClassifier
# End-to-end model: preprocessing followed by a LightGBM classifier seeded with SEED.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMClassifier(random_state=SEED)),
])
from sklearn import set_config
# Switch estimator display to the diagram form, then show the pipeline.
set_config(display='diagram')
pipe
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num1',
Pipeline(steps=[('imputer',
FunctionTransformer(func=,
kw_args={'col': 'TotalCharges'})),
('scaler',
StandardScaler())]),
['TotalCharges']),
('num2',
Pipeline(steps=[('scaler',
StandardScaler())]),
['tenure', 'MonthlyCharges']),
('cat',
Pipeline(steps=[('ohe',
OneHotEncoder(handle_unknown='ignore'))]),
['Partner', 'Dependents',
'PhoneService',
'MultipleLines',
'InternetService',
'OnlineSecurity',
'OnlineBackup',
'DeviceProtection',
'TechSupport', 'StreamingTV',
'StreamingMovies',
'Contract',
'PaperlessBilling',
'PaymentMethod',
'SeniorCitizen'])])),
('model', LGBMClassifier(random_state=100))]) ColumnTransformer(transformers=[('num1',
Pipeline(steps=[('imputer',
FunctionTransformer(func=,
kw_args={'col': 'TotalCharges'})),
('scaler', StandardScaler())]),
['TotalCharges']),
('num2',
Pipeline(steps=[('scaler', StandardScaler())]),
['tenure', 'MonthlyCharges']),
('cat',
Pipeline(steps=[('ohe',
OneHotEncoder(handle_unknown='ignore'))]),
['Partner', 'Dependents', 'PhoneService',
'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup',
'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract',
'PaperlessBilling', 'PaymentMethod',
'SeniorCitizen'])]) ['TotalCharges']
FunctionTransformer(func=, kw_args={'col': 'TotalCharges'})
StandardScaler()
['tenure', 'MonthlyCharges']
StandardScaler()
['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
OneHotEncoder(handle_unknown='ignore')
LGBMClassifier(random_state=100)
from sklearn import set_config
# Back to the plain-text estimator display.
set_config(display='text')
# Fit preprocessing + model on the training split (trailing ';' suppresses the cell's echo of the fitted pipeline).
pipe.fit(df_Xtrain,ytrain);
# Class labels and per-class probabilities for the test split.
ypreds = pipe.predict(df_Xtest)
yprobs2d = pipe.predict_proba(df_Xtest)
# Print/save the metrics under the name 'lightgbm' and draw the plots.
model_eval_bin('lightgbm',ytest,ypreds,yprobs2d,show_plots=True)
precision recall f1-score support
0 0.84 0.88 0.86 1035
1 0.63 0.55 0.58 374
accuracy 0.79 1409
macro avg 0.74 0.71 0.72 1409
weighted avg 0.79 0.79 0.79 1409
[[914 121]
[170 204]]
| Accuracy | Precision | Recall | F1-score | AUC | |
|---|---|---|---|---|---|
| lightgbm | 0.7935 | 0.6277 | 0.5455 | 0.5837 | 0.7143 |
/Users/poudel/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/sklearn/utils/deprecation.py:86: FutureWarning: Function plot_roc_curve is deprecated; This will be removed in v0.5.0. Please use scikitplot.metrics.plot_roc instead.
# Report the wall-clock time elapsed since the start of the notebook.
time_taken = time.time() - time_start_notebook
# h: whole hours; m: the seconds left over after removing them
h, m = divmod(time_taken, 3600)
print(
    'Time taken to run whole notebook: {:.0f} hr {:.0f} min {:.0f} secs'.format(
        h, *divmod(m, 60)
    )
)
Time taken to run whole notebook: 0 hr 0 min 4 secs